Introduction to R

Intro to Modeling

Author

Juan Steibel and Austin Putz

Published

January 18, 2023

# remove all objects if restarting script
rm(list=ls())

# garbage collection (clean memory/RAM)
gc()
          used (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
Ncells 1115522 59.6    2212632 118.2         NA  2212632 118.2
Vcells 1906886 14.6    8388608  64.0      65536  2827891  21.6
# print session info (all versions)
sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.5.1   stringr_1.4.1   dplyr_1.0.10    purrr_0.3.4    
 [5] readr_2.1.2     tidyr_1.2.1     tibble_3.1.8    tidyverse_1.3.2
 [9] lubridate_1.8.0 ggthemes_4.2.4  plotly_4.10.0   ggplot2_3.3.6  
[13] knitr_1.39     

loaded via a namespace (and not attached):
 [1] tidyselect_1.1.2    xfun_0.31           haven_2.5.0        
 [4] gargle_1.2.0        colorspace_2.0-3    vctrs_0.4.2        
 [7] generics_0.1.3      htmltools_0.5.3     viridisLite_0.4.1  
[10] yaml_2.3.5          utf8_1.2.2          rlang_1.0.6        
[13] pillar_1.8.1        glue_1.6.2          withr_2.5.0        
[16] DBI_1.1.3           dbplyr_2.2.1        readxl_1.4.0       
[19] modelr_0.1.8        lifecycle_1.0.2     cellranger_1.1.0   
[22] munsell_0.5.0       gtable_0.3.1        rvest_1.0.3        
[25] htmlwidgets_1.5.4   evaluate_0.16       tzdb_0.3.0         
[28] fastmap_1.1.0       fansi_1.0.3         broom_1.0.0        
[31] scales_1.2.1        backports_1.4.1     googlesheets4_1.0.0
[34] jsonlite_1.8.2      fs_1.5.2            hms_1.1.1          
[37] digest_0.6.29       stringi_1.7.8       grid_4.2.1         
[40] cli_3.4.1           tools_4.2.1         magrittr_2.0.3     
[43] lazyeval_0.2.2      crayon_1.5.2        pkgconfig_2.0.3    
[46] ellipsis_0.3.2      xml2_1.3.3          data.table_1.14.2  
[49] reprex_2.0.1        googledrive_2.0.0   assertthat_0.2.1   
[52] rmarkdown_2.14      httr_1.4.4          rstudioapi_0.13    
[55] R6_2.5.1            compiler_4.2.1     
#==============================================================================#
# Setup Options
#==============================================================================#

# set tibble width for printing
options(tibble.width = Inf)

# remove scientific notation
options(scipen=999)

#==============================================================================#
# Packages
#==============================================================================#

# load library
#library(JuliaCall)
library(GGally)
library(datasets) 
library(plotly)
library(ggthemes)
library(lubridate)
library(tidyverse)

#==============================================================================#
# Set paths
#==============================================================================#

# set all paths
path_main    <- "~/Documents/ISU/Classes/AnS_562/2023/Part_A/R/"
path_data    <- paste0(path_main, "Data/")
path_plots   <- paste0(path_main, "Plots/")
path_output  <- paste0(path_main, "Output/")
path_results <- paste0(path_main, "Results/")
path_scripts <- paste0(path_main, "Scripts/")

# set working directory
setwd(path_main)


Dataset

# pull data from datasets package
data ("longley")

# library(arrow)
# # write out arrow file
# write_feather(
#   x = longley,
#   sink = "longley.arrow")

# print help on longley dataset
?longley

# print dimensions (number of rows x number of columns)
dim(longley)
[1] 16  7
# print first rows of dataset
head(longley)
     GNP.deflator     GNP Unemployed Armed.Forces Population Year Employed
1947         83.0 234.289      235.6        159.0    107.608 1947   60.323
1948         88.5 259.426      232.5        145.6    108.632 1948   61.122
1949         88.2 258.054      368.2        161.6    109.773 1949   60.171
1950         89.5 284.599      335.1        165.0    110.929 1950   61.187
1951         96.2 328.975      209.9        309.9    112.075 1951   63.221
1952         98.1 346.999      193.2        359.4    113.270 1952   63.639
# print correlation of all variables
pairs(longley, main = "Scatter plot with all variables")

# plot all correlations
ggpairs(longley, progress=FALSE) # from GGally package

# plot Employed on GNP from langley dataset
longley %>%
ggplot(., aes(x=GNP,y=Employed)) + 
  geom_point() +    # points for scatter
  geom_smooth(method = "lm") # plot smooth linear model line to plot




Model

We will be fitting the following

\begin{equation} \boldsymbol{y} = \boldsymbol{Xb} + \boldsymbol{e} \end{equation}

where y is the response variable (Employed column), X is an incidence matrix (will cover later), b is a vector of coefficients we are solving for (intercept and slope), and finally e is the residual (not predicted from the model).

# Fit a very simple linear model:
# y = intercept + slope*GNP + e
srm <- lm(Employed ~ GNP, data=longley)

# print summary of model
summary(srm)

Call:
lm(formula = Employed ~ GNP, data = longley)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.77958 -0.55440 -0.00944  0.34361  1.44594 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 51.843590   0.681372   76.09  < 2e-16 ***
GNP          0.034752   0.001706   20.37 8.36e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6566 on 14 degrees of freedom
Multiple R-squared:  0.9674,    Adjusted R-squared:  0.965 
F-statistic: 415.1 on 1 and 14 DF,  p-value: 8.363e-12
# what is under the hood:
names(srm)
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"        
# get a more in depth view of this 'object'
str(srm)
List of 12
 $ coefficients : Named num [1:2] 51.8436 0.0348
  ..- attr(*, "names")= chr [1:2] "(Intercept)" "GNP"
 $ residuals    : Named num [1:16] 0.3373 0.2628 -0.6406 -0.5471 -0.0552 ...
  ..- attr(*, "names")= chr [1:16] "1947" "1948" "1949" "1950" ...
 $ effects      : Named num [1:16] -261.268 13.378 -0.748 -0.644 -0.134 ...
  ..- attr(*, "names")= chr [1:16] "(Intercept)" "GNP" "" "" ...
 $ rank         : int 2
 $ fitted.values: Named num [1:16] 60 60.9 60.8 61.7 63.3 ...
  ..- attr(*, "names")= chr [1:16] "1947" "1948" "1949" "1950" ...
 $ assign       : int [1:2] 0 1
 $ qr           :List of 5
  ..$ qr   : num [1:16, 1:2] -4 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:16] "1947" "1948" "1949" "1950" ...
  .. .. ..$ : chr [1:2] "(Intercept)" "GNP"
  .. ..- attr(*, "assign")= int [1:2] 0 1
  ..$ qraux: num [1:2] 1.25 1.25
  ..$ pivot: int [1:2] 1 2
  ..$ tol  : num 1e-07
  ..$ rank : int 2
  ..- attr(*, "class")= chr "qr"
 $ df.residual  : int 14
 $ xlevels      : Named list()
 $ call         : language lm(formula = Employed ~ GNP, data = longley)
 $ terms        :Classes 'terms', 'formula'  language Employed ~ GNP
  .. ..- attr(*, "variables")= language list(Employed, GNP)
  .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:2] "Employed" "GNP"
  .. .. .. ..$ : chr "GNP"
  .. ..- attr(*, "term.labels")= chr "GNP"
  .. ..- attr(*, "order")= int 1
  .. ..- attr(*, "intercept")= int 1
  .. ..- attr(*, "response")= int 1
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. ..- attr(*, "predvars")= language list(Employed, GNP)
  .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
  .. .. ..- attr(*, "names")= chr [1:2] "Employed" "GNP"
 $ model        :'data.frame':  16 obs. of  2 variables:
  ..$ Employed: num [1:16] 60.3 61.1 60.2 61.2 63.2 ...
  ..$ GNP     : num [1:16] 234 259 258 285 329 ...
  ..- attr(*, "terms")=Classes 'terms', 'formula'  language Employed ~ GNP
  .. .. ..- attr(*, "variables")= language list(Employed, GNP)
  .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. .. ..$ : chr [1:2] "Employed" "GNP"
  .. .. .. .. ..$ : chr "GNP"
  .. .. ..- attr(*, "term.labels")= chr "GNP"
  .. .. ..- attr(*, "order")= int 1
  .. .. ..- attr(*, "intercept")= int 1
  .. .. ..- attr(*, "response")= int 1
  .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. .. ..- attr(*, "predvars")= language list(Employed, GNP)
  .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
  .. .. .. ..- attr(*, "names")= chr [1:2] "Employed" "GNP"
 - attr(*, "class")= chr "lm"
# Obtain Residuals:
# e = y - (intercept_hat + x*beta_hat)
# add residuals to original dataset 
longley$e_hat <- residuals(srm)

# Obtain Predicted Values (y_hat):
# (intercept_hat + x*beta_hat)
longley$y_hat <- predict(srm)

# print first lines of dataset
head(longley)
     GNP.deflator     GNP Unemployed Armed.Forces Population Year Employed
1947         83.0 234.289      235.6        159.0    107.608 1947   60.323
1948         88.5 259.426      232.5        145.6    108.632 1948   61.122
1949         88.2 258.054      368.2        161.6    109.773 1949   60.171
1950         89.5 284.599      335.1        165.0    110.929 1950   61.187
1951         96.2 328.975      209.9        309.9    112.075 1951   63.221
1952         98.1 346.999      193.2        359.4    113.270 1952   63.639
           e_hat    y_hat
1947  0.33732993 59.98567
1948  0.26276150 60.85924
1949 -0.64055835 60.81156
1950 -0.54705800 61.73406
1951 -0.05522581 63.27623
1952 -0.26360117 63.90260


Diagnostics

# three commonly used diagnostic plots

# residual on Employed
ggplot(longley,aes(x=Employed, y=e_hat)) + 
  geom_point() + 
  geom_smooth(method = "lm")

# residual on y_hat
ggplot(longley, aes(x=y_hat, y=e_hat)) + 
  geom_point() + 
  geom_smooth(method = "lm")

# residual on Year
ggplot(longley, aes(x=Year, y=e_hat)) + 
  geom_point() + 
  geom_smooth(method = "lm")

# running plot on the model will produce 6 diagnostic plots you should check
par(mfrow=c(2,3))
plot(srm, which=1:6)

par(mfrow=c(1,1))
# histogram of residuals
longley %>%
ggplot(., aes(x=e_hat)) +
  geom_histogram(fill="dodgerblue3", color="white", binwidth=0.25) +
  geom_vline(xintercept = 0, linetype=2, color="red") +
  labs(
    title = "Histogram of Residuals",
    subtitle = "y = Xb + e Model",
    x = "Residual from Model",
    y = "Count",
    caption = "Longley dataset"
  )

# histogram of y_hat
longley %>%
ggplot(., aes(x=y_hat)) +
  geom_histogram(fill="dodgerblue3", color="white", binwidth=2) +
  labs(
    title = "Histogram of Y hat Values",
    subtitle = "y = Xb + e Model",
    x = "Y hat Values from Model",
    y = "Count",
    caption = "Longley dataset"
  )

# density of y_hat
longley %>%
  select(Employed, y_hat) %>%
  gather() %>%
ggplot(., aes(x=value, fill=key)) +
  geom_density(color="white", alpha=0.2) +
  scale_fill_discrete("Column", labels = c("y = Employed", "Y hat")) +
  labs(
    title = "Histogram of Observed and Y hat Values",
    subtitle = "y = Xb + e Model",
    x = "Y hat Values from Model",
    y = "Count",
    caption = "Longley dataset"
  )




Simulation


Basic Simulation

This simulation will assumed the model coefficients are fixed.

#------------------------------------------------------------------------------#
# simulation to check the model
#------------------------------------------------------------------------------#

# extract linear coefficients from model (intercept and slope term)
coefs <- srm$coefficients

# extract residual sd. note it's not directly available
sigma_e <- summary(srm)$sigma 

# predict each value given the model
y_hat <- coefs[1] + (coefs[2]*longley$GNP)

# print y_hat values
y_hat
 [1] 59.98567 60.85924 60.81156 61.73406 63.27623 63.90260 64.54156 64.46256
 [9] 65.65655 66.41106 67.23083 67.29258 68.61866 69.31013 69.85129 71.12743
# add y_hat to dataset
longley$y_hat
 [1] 59.98567 60.85924 60.81156 61.73406 63.27623 63.90260 64.54156 64.46256
 [9] 65.65655 66.41106 67.23083 67.29258 68.61866 69.31013 69.85129 71.12743
# Simulate residuals nrep times given model estimate of sigma_e (residual variance)
nrep <- 100
e_m  <- replicate(n=nrep, rnorm(nrow(longley), mean = 0, sd = sigma_e))
dim(e_m)
[1]  16 100
# generate new predicted values given residual samples
p_m <- y_hat + e_m
#p_m

# boxplot - predicted on GNP
boxplot(p_m ~ longley$GNP)

# boxplot - predicted on GNP
boxplot(p_m ~ longley$GNP, at=longley$GNP)
abline(coefs)
points(longley$GNP, longley$Employed, pch=19, col="red")


Advanced Simulation

This simulation will not assumed the coefficients from the model are fixed. The following will generate new estimates of the coefficients given the variance of the estimates.

# Simulate multiple regression lines (?)
# this is a more advanced topic for which we need matrix algebra and some 
# OLS and model fitting results. We will come back to this in 2 weeks. 

# (co)variance matrix of model coefficients
vcv <- vcov(srm)

# print vcv matrix
print(vcv)
             (Intercept)           GNP
(Intercept)  0.464267307 -1.127991e-03
GNP         -0.001127991  2.909454e-06
# sqrt diagonals to get SE of coefficients
vcov(srm) %>%
  diag() %>%
  sqrt()
(Intercept)         GNP 
0.681371636 0.001705712 
# set number of reps to simulate
nrep <- 1000

# choleski decomposition of variance-covariance
L <- (chol(vcv))

# simulate new coefficients from the model (not fixed anymore)

# create 1000 x 2 matrix of standard normal samples
n01 <- matrix(rnorm(2*nrep), nrep, 2)

# dim of n01
dim(n01)
[1] 1000    2
# multiply this matrix by chol decomposed vcv matrix
beta_sim <- n01 %*% L # %*% is matrix multiplication

# add coefficients to this
beta_sim[,1] <- beta_sim[,1] + coefs[1] # add intercept estimate
beta_sim[,2] <- beta_sim[,2] + coefs[2] # add slope estimate
head(beta_sim)
     (Intercept)        GNP
[1,]    52.11163 0.03460838
[2,]    50.24527 0.03876970
[3,]    52.10188 0.03360805
[4,]    50.68452 0.03747766
[5,]    51.00210 0.03644758
[6,]    51.06197 0.03609400
# check covariance
cov(beta_sim)
             (Intercept)           GNP
(Intercept)  0.492650186 -1.189365e-03
GNP         -0.001189365  3.036710e-06
# should be the same as vcv
vcv
             (Intercept)           GNP
(Intercept)  0.464267307 -1.127991e-03
GNP         -0.001127991  2.909454e-06
# print column means of simulated coefficients (should be close to the model estimates)
colMeans(beta_sim)
(Intercept)         GNP 
51.87315604  0.03468494 
# plot regression lines + average line + observed points
plot(longley$GNP,longley$Employed, type="none",
     main = "Employed on GNP", xlab = "GNP", ylab = "Employed")
abline(coefs, lwd=2, col="red")
apply(beta_sim, 1, function(x) abline(x,col="gray"))
NULL
abline(coefs, lwd=2, col="red")
points(longley$GNP, longley$Employed, pch=19)